In [1]:
import numpy as np
import plotly.express as px
import os
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import umap
from pyensembl import EnsemblRelease
from itertools import product
from Bio.Seq import translate
import pickle

import ibis
ibis.set_backend("duckdb")
ibis.options.interactive = True
from ibis import _
import ibis.selectors as s
import warnings
warnings.filterwarnings('ignore')
/Users/jordanramsdell/mambaforge/envs/ml_ibis/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
  from .autonotebook import tqdm as notebook_tqdm

Goal¶

  • We're going to look more closely at the targets database, specifically protein-coding targets, and embed them

Prelude (defining some functions for later)¶

In [2]:
def construct_databases(base_loc):
    mappings = {}
    for directory in os.listdir(base_loc):
        if directory.startswith("."):
            continue
        loc = base_loc + "/" + directory
        t = ibis.read_parquet(loc)
        mappings["t_" + directory] = t
    return mappings

# Load parquet databases into local variables
locals().update(construct_databases("../../../data/open_targets/"))


def vectorize_and_embed(docs, n_components=3, use_densmap=False, 
                        metric='euclidean', n_neighbors=15, vectorizer_fun=lambda: CountVectorizer(stop_words='english')):
    counts = vectorizer_fun().fit_transform(docs)
    mapper = umap.UMAP(n_components=n_components, densmap=use_densmap, metric=metric, random_state=42).fit(counts)
    return mapper
    
def construct_scatterplot(df, mapper, hover_name, color=None, hover_data=None):
    embeddings = mapper.embedding_.T
    df["x"], df["y"], df["z"] = embeddings
        
    fig = px.scatter_3d(df, x="x", y="y", z="z", color=color,
                        hover_name=hover_name, hover_data=hover_data)
    fig.update_layout(margin=dict(l=0, r=0, t=0, b=0))
    fig.update_traces(marker=dict(size=2))
    return fig.show()

Data Prep¶

Subcellular Locations (first term only; used for coloring)¶

  • Subcellular location conerns where the product of a gene tends to be located.
    • Has a formal ontology. More info here: Subcellular location | UniProt help | UniProt
  • Genes can be associated with more than one subcellular location.
    • Since I am using this to color/label plots, I only consider the FIRST location for each gene.
    • So this will not be representative of all the locations
In [3]:
# For this demo, only considering protein-coding targets
protein_coding_targets = t_targets.filter(_.biotype == 'protein_coding')

subcell_by_id = (protein_coding_targets
 .select("id", _.subcellularLocations.unnest())
 .unpack("subcellularLocations")
 .filter(~ _.location.startswith("["))
 .group_by("id")
 .agg(location = _.location.first()))

subcell_by_id
Out[3]:
┏━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┓
┃ id              ┃ location             ┃
┡━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━┩
│ string          │ string               │
├─────────────────┼──────────────────────┤
│ ENSG00000076555 │ Mitochondrion        │
│ ENSG00000095794 │ Nucleus              │
│ ENSG00000102755 │ Endosome             │
│ ENSG00000122566 │ Nucleus              │
│ ENSG00000135722 │ Golgi apparatus      │
│ ENSG00000163638 │ Secreted             │
│ ENSG00000169371 │ Nucleus              │
│ ENSG00000198040 │ Nucleus              │
│ ENSG00000040531 │ Melanosome membrane  │
│ ENSG00000136270 │ Mitochondrion matrix │
│ …               │ …                    │
└─────────────────┴──────────────────────┘

Aggregating GO-term descriptions associated with each gene¶

  • Genes can have one or more GO terms
  • We're going to use the text in those GO terms to do embeddings (using Bag of Words model)
In [4]:
go_name_to_gene_id = (protein_coding_targets
 .select(id=_.id, go_id=_.go.unnest().id)
 .inner_join(t_go, _.go_id == t_go.id)
 .group_by("id")
 .agg(go_desc = _.name.collect().join(" ")))

go_name_to_gene_id
Out[4]:
┏━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
┃ id              ┃ go_desc                                                                          ┃
┡━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
│ string          │ string                                                                           │
├─────────────────┼──────────────────────────────────────────────────────────────────────────────────┤
│ ENSG00000116748 │ IMP salvage AMP deaminase activity metal ion binding IMP biosynthetic process G… │
│ ENSG00000138755 │ defense response to virus positive regulation of release of sequestered calcium… │
│ ENSG00000141519 │ cytoplasm motile cilium assembly flagellated sperm motility cilium axoneme cili… │
│ ENSG00000165118 │ tRNA-guanine transglycosylation molecular_function cellular_component biologica… │
│ ENSG00000166181 │ membrane RNA binding cytoplasm fibroblast apoptotic process negative regulation… │
│ ENSG00000170006 │ membrane protein binding                                                         │
│ ENSG00000198203 │ sulfotransferase activity cytoplasm lysosome aryl sulfotransferase activity ami… │
│ ENSG00000214193 │ plasma membrane nucleoplasm cell migration actin filament organization           │
│ ENSG00000064199 │ sperm fibrous sheath binding of sperm to zona pellucida external side of plasma… │
│ ENSG00000140326 │ endomembrane system protein localization plasma membrane cytoplasm chromatin or… │
│ …               │ …                                                                                │
└─────────────────┴──────────────────────────────────────────────────────────────────────────────────┘

Combing results into final table¶

In [5]:
query = (protein_coding_targets
 .left_join(subcell_by_id, "id", rname="cell_id")
 .left_join(go_name_to_gene_id, "id", rname="go_id")
 .mutate(functionDescriptions=_.functionDescriptions.join(" "))
 .select("id", "approvedName", "go_desc", "functionDescriptions", "location")
 .mutate(truncDesc = _.functionDescriptions[0:50])
 .fillna({"location":"N/A", "functionDescriptions":"N/A", "go_desc":"N/A"})
 .filter(_.go_desc != "N/A" 
             and _.location != "N/A"
             and _.functionDescriptions != "N/A"))
query
Out[5]:
┏━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
┃ id              ┃ approvedName                                            ┃ go_desc                                                                          ┃ functionDescriptions                                                             ┃ location                     ┃ truncDesc                                          ┃
┡━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
│ string          │ string                                                  │ string                                                                           │ string                                                                           │ string                       │ string                                             │
├─────────────────┼─────────────────────────────────────────────────────────┼──────────────────────────────────────────────────────────────────────────────────┼──────────────────────────────────────────────────────────────────────────────────┼──────────────────────────────┼────────────────────────────────────────────────────┤
│ ENSG00000059588 │ TAR (HIV-1) RNA binding protein 1                       │ tRNA (guanine) methyltransferase activity regulation of transcription by RNA po… │ Probable S-adenosyl-L-methionine-dependent methyltransferase which methylates R… │ Nuclear speckles             │ Probable S-adenosyl-L-methionine-dependent methylt │
│ ENSG00000072071 │ adhesion G protein-coupled receptor L1                  │ latrotoxin receptor activity G protein-coupled receptor signaling pathway plasm… │ Calcium-independent receptor of high affinity for alpha- latrotoxin, an excitat… │ Cell membrane                │ Calcium-independent receptor of high affinity for  │
│ ENSG00000073536 │ notchless homolog 1                                     │ skeletal system morphogenesis positive regulation of canonical Wnt signaling pa… │ Plays a role in regulating Notch activity. Plays a role in regulating the expre… │ Nucleus                      │ Plays a role in regulating Notch activity. Plays a │
│ ENSG00000075290 │ Wnt family member 8B                                    │ canonical Wnt signaling pathway signal transduction nervous system development … │ Ligand for members of the frizzled family of seven transmembrane receptors. May… │ Secreted                     │ Ligand for members of the frizzled family of seven │
│ ENSG00000083454 │ purinergic receptor P2X 5                               │ positive regulation of calcium ion transport into cytosol positive regulation o… │ Receptor for ATP that acts as a ligand-gated ion channel.                        │ Membrane                     │ Receptor for ATP that acts as a ligand-gated ion c │
│ ENSG00000083782 │ epiphycan                                               │ articular cartilage development glycosaminoglycan binding bone development extr… │ May have a role in bone formation and also in establishing the ordered structur… │ Secreted                     │ May have a role in bone formation and also in esta │
│ ENSG00000087087 │ serrate, RNA effector molecule                          │ primary miRNA processing regulation of DNA-templated transcription DNA binding … │ Acts as a mediator between the cap-binding complex (CBC) and the primary microR… │ Nucleus                      │ Acts as a mediator between the cap-binding complex │
│ ENSG00000087502 │ ERGIC and golgi 2                                       │ retrograde vesicle-mediated transport, Golgi to endoplasmic reticulum transport… │ Possible role in transport between endoplasmic reticulum and Golgi. .            │ Endoplasmic reticulum        │ Possible role in transport between endoplasmic ret │
│ ENSG00000092201 │ SPT16 homolog, facilitates chromatin remodeling subunit │ nucleoplasm nucleoplasm FACT complex RNA binding nucleoplasm nucleoplasm transc… │ Component of the FACT complex, a general chromatin factor that acts to reorgani… │ Nucleus                      │ Component of the FACT complex, a general chromatin │
│ ENSG00000102078 │ solute carrier family 25 member 14                      │ plasma membrane mitochondrial inner membrane mitochondrial inner membrane mitoc… │ Participates in the mitochondrial proton leak measured in brain mitochondria.    │ Mitochondrion inner membrane │ Participates in the mitochondrial proton leak meas │
│ …               │ …                                                       │ …                                                                                │ …                                                                                │ …                            │ …                                                  │
└─────────────────┴─────────────────────────────────────────────────────────┴──────────────────────────────────────────────────────────────────────────────────┴──────────────────────────────────────────────────────────────────────────────────┴──────────────────────────────┴────────────────────────────────────────────────────┘
In [6]:
df = query.execute()

Gene transcripts (and translated proteins)¶

  • Use gene ids (from Open Targets) to retrieve transcripts from Ensembl (using a convenient library!)
    • To install library, you need to follow extra steps. See here: pyensembl · PyPI
  • For this demo:
    • Only considering the first transcript associated with the gene (typically it's the primary one)
    • Only considering human transcripts with Ensembl IDs (thankfully, this is most of them!)

Other notes:

  • contig_ids: The contig (which comes from the GRCh38 human assembly) that the transcript is located on. Mostly used for debugging/coloring
In [7]:
ensembl_data = EnsemblRelease(77)

def retrieve_transcript_sequence(i):
    try:
        contig = ensembl_data.gene_by_id(i)
        sequence = contig.transcripts[0].sequence # only use the first transcript
        contig_num = contig.transcripts[0].contig
        contig_num = contig_num if contig_num.isnumeric() else "Other"
        return (sequence, contig_num)
    except ValueError:
        return ("N/A", "N/A")
    
    
def do_translate(nucleotide_seq):
    try:
        return translate(nucleotide_seq)
    except: # I'm lazy
        return "N/A"
In [8]:
# Retrieve nucletide sequences and filter out IDs that we could not retrieve
df["nucleotide"], df["contig"] = zip(*[retrieve_transcript_sequence(i) for i in df["id"]])
df = df[df["nucleotide"] != "N/A"]

# Translate into proteins and filter out N/A (not perfect; has some issues because not all have a complete reading frame, so there ae some stops)
# It at least gives us something to embed with for now!
df["protein"] = [do_translate(i) for i in df["nucleotide"]]
df = df[df["protein"] != "N/A"]
INFO:pyensembl.sequence_data:Loaded sequence dictionary from /Users/jordanramsdell/Library/Caches/pyensembl/GRCh38/ensembl77/Homo_sapiens.GRCh38.cdna.all.fa.gz.pickle
INFO:pyensembl.sequence_data:Loaded sequence dictionary from /Users/jordanramsdell/Library/Caches/pyensembl/GRCh38/ensembl77/Homo_sapiens.GRCh38.ncrna.fa.gz.pickle
In [9]:
# Adding index (for debugging purposes)
df["index"] = list(range(len(df)))

Mapper Training / Visualization¶

Gene descriptions -> Embedding¶

  • Using tokenizer again; the gene functiond descriptions aren't that long, so not expecting embeddings to be that good at differentiating between genes
In [10]:
gene_desc_mapper = vectorize_and_embed(df["functionDescriptions"], 
                                       metric='hellinger', 
                                       vectorizer_fun=lambda: TfidfVectorizer(stop_words='english', norm='l1', max_features=1000))

Gene Descriptions -> Visualization¶

  • As we can see, not the best embedding. Clumping behavior probably because of low-quality/insufficient gene descriptions.
  • Note the olfactory receptor cluster. You'll see that a lot in the later embeddings. They appear distinct.
In [11]:
construct_scatterplot(df, gene_desc_mapper, hover_name="approvedName", 
                      color="location", hover_data=["id", "contig", "truncDesc", "index"])

Gene Go-Terms -> Embedding¶

  • Since we're vectorizing concatenated GO descs (instead of their IDs), hellinger metric also works here
  • If we had used just the GO IDs, binary metric may be better (because the IDs can't appear more than once per gene).
In [12]:
gene_go_mapper = vectorize_and_embed(df["go_desc"], metric='hellinger', 
                                     vectorizer_fun=lambda: TfidfVectorizer(norm="l1"))

Gene Go-Terms -> Visualization¶

  • Not surprising that this look better. GO terms were more relevant
    • Also, subcellular locations are related to GO terms, so we'd expect the coloring to line up here.
In [13]:
construct_scatterplot(df, gene_go_mapper, hover_name="approvedName", 
                      color="location", hover_data=["id", "contig", "truncDesc", "index"])

Gene Nucleotide K-mers -> Embedding¶

  • This time, we're using K-mers (1 to 3-mers).
  • TfidfVectorizer (and CountVectorizer) support this by using ngram_range
  • A 1-gram would just be counts of "A", "T", "C", "G"
  • A 2-gram would be counts of "AT", "AC", "TA", ... etc. over a moving window
  • For more info, see: k-mer - Wikipedia
In [14]:
gene_nucleotide_mapper = vectorize_and_embed(df["nucleotide"], metric='hellinger', 
                                     vectorizer_fun=lambda: TfidfVectorizer(analyzer="char", norm="l1", 
                                                                            ngram_range=(1, 3),  
                                                                            lowercase=False))

Gene Transcript K-mers -> Visualization¶

  • Again, the olfactory genes stand out. Same with that histone cluster.
  • Overall, this is a more smooth / evenly distributed space.
  • Olfactory receptor family sticks out
In [15]:
construct_scatterplot(df, gene_nucleotide_mapper, hover_name="approvedName", 
                      color="location", hover_data=["id", "contig", "truncDesc", "index"])

Gene Protein K-mers -> Embedding¶

  • Same idea as before, but this time with amino acids.
  • Using only 1-mers and 2-mers here (there are many combinations in contrast to nucleotides)
In [16]:
gene_protein_mapper = vectorize_and_embed(df["protein"], metric='hellinger', 
                                     vectorizer_fun=lambda: TfidfVectorizer(analyzer="char", 
                                                                            norm="l1", 
                                                                            ngram_range=(1, 2), 
                                                                            lowercase=False))

Gene Protein K-mers -> Visualization¶

  • Again... olfactory cluster stands out.
  • Overall, much more dense/tighly packed than the nucleotide embedding
In [17]:
construct_scatterplot(df, gene_protein_mapper, hover_name="approvedName", 
                      color="location", hover_data=["id", "contig", "truncDesc", "index"])

Exporting mappers (used for prediction in another notebook)¶

In [18]:
pickle.dump(df, open("models/gene_df.sav", 'wb'))
pickle.dump(gene_desc_mapper, open("models/gene_desc_mapper.sav", 'wb'))
pickle.dump(gene_go_mapper, open("models/gene_go_mapper.sav", 'wb'))
pickle.dump(gene_nucleotide_mapper, open("models/gene_nucleotide_mapper.sav", 'wb'))
pickle.dump(gene_protein_mapper, open("models/gene_protein_mapper.sav", 'wb'))